Show the code
import pandas as pd
import numpy as np
import sqlite3
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)import pandas as pd
import numpy as np
import sqlite3
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)sqlite_file = 'lahman_1871-2022.sqlite'
con = sqlite3.connect(sqlite_file)
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", con)This section of the analysis will have the purpose of seeing if salary expenditure on teams is related to post season success.
# Perform a query that returns the total salary, total wins, total losses, per year and team
base = pd.read_sql_query("""
WITH ps AS (
-- collect game wins for both winners and losers of each postseason series
SELECT yearID, teamIDwinner AS teamID, wins AS gwins FROM SeriesPost
UNION ALL
SELECT yearID, teamIDloser AS teamID, losses AS gwins FROM SeriesPost
),
ps_agg AS (
SELECT yearID, teamID, SUM(gwins) AS postseason_wins
FROM ps
GROUP BY yearID, teamID
),
ws AS (
SELECT yearID, teamIDwinner AS teamID, 1 AS won_ws
FROM SeriesPost
WHERE round = 'WS'
)
SELECT
t.yearID,
t.name,
SUM(s.salary) AS total_salary,
t.W AS wins,
t.L AS losses,
COALESCE(ps_agg.postseason_wins, 0) AS postseason_wins, -- <-- total PS wins (all rounds)
COALESCE(ws.won_ws, 0) AS won_world_series -- 1 if WS champ, else 0
FROM teams t
JOIN salaries s
ON t.teamID = s.teamID
AND t.yearID = s.yearID
LEFT JOIN ps_agg
ON ps_agg.teamID = t.teamID
AND ps_agg.yearID = t.yearID
LEFT JOIN ws
ON ws.teamID = t.teamID
AND ws.yearID = t.yearID
GROUP BY t.teamID, t.yearID, t.name, t.W, t.L, ps_agg.postseason_wins, ws.won_ws
ORDER BY t.yearID;
""", con)
# Calculate a wins per million dollars expended column
base['wins_per_million'] = base['wins'] / (base['total_salary'] / 1_000_000)
# Change won_world_series to boolean
base['won_world_series'] = base['won_world_series'].astype('bool')
# Convert Salary to millions
base['total_salary'] = base['total_salary'] / 1000000ggplot(base, aes(x="total_salary")) +\
geom_histogram()ggplot(base, aes(x="wins")) +\
geom_histogram()ggplot(base, aes(x="won_world_series", y='total_salary', fill='won_world_series')) +\
geom_boxplot() +\
scale_y_continuous(limits=(0,200))ggplot(base, aes(x="won_world_series", y='wins', fill='won_world_series')) +\
geom_boxplot() +\
scale_y_continuous(limits=(40,120))ggplot(base, aes(x="won_world_series", y='wins_per_million', fill='won_world_series')) +\
geom_boxplot() +\
scale_y_continuous(limits=(0,10))ggplot(base, aes(x="wins", y='total_salary', col='won_world_series')) +\
geom_point() +\
geom_smooth(method='lm') +\
scale_y_continuous(limits=(0,200))import statsmodels.formula.api as smf
model_simple = smf.ols(
"total_salary ~ wins + won_world_series",
data=base
).fit(cov_type="HC3")
print(model_simple.summary()) OLS Regression Results
==============================================================================
Dep. Variable: total_salary R-squared: 0.065
Model: OLS Adj. R-squared: 0.063
Method: Least Squares F-statistic: 34.40
Date: Sun, 28 Sep 2025 Prob (F-statistic): 3.93e-15
Time: 15:16:07 Log-Likelihood: -4730.6
No. Observations: 918 AIC: 9467.
Df Residuals: 915 BIC: 9482.
Df Model: 2
Covariance Type: HC3
============================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------
Intercept -13.1447 8.626 -1.524 0.128 -30.052 3.762
won_world_series[T.True] 4.6751 9.626 0.486 0.627 -14.191 23.541
wins 0.9135 0.114 8.006 0.000 0.690 1.137
==============================================================================
Omnibus: 137.193 Durbin-Watson: 0.773
Prob(Omnibus): 0.000 Jarque-Bera (JB): 212.963
Skew: 1.000 Prob(JB): 5.70e-47
Kurtosis: 4.251 Cond. No. 584.
==============================================================================
Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)